Set pipeline:

Load preprocessed dataset (preprocessing code in /../FirstYearReview/data_preprocessing.Rmd)

Keep DE genes

datExpr = datExpr %>% filter(rownames(.) %in% rownames(DE_info)[DE_info$padj<0.05])
rownames(datExpr) = datGenes$feature_id[DE_info$padj<0.05 & !is.na(DE_info$padj)]
datGenes = datGenes %>% filter(feature_id %in% rownames(DE_info)[DE_info$padj<0.05])
DE_info = DE_info %>% filter(padj<0.05)

print(paste0('Keeping ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "Keeping 3000 genes and 80 samples"




Define a gene co-expression similarity


Using Biweight midcorrelation

allowWGCNAThreads()
## Allowing multi-threading with up to 64 threads.
cor_mat = datExpr %>% t %>% bicor

Correcting the correlation matrix from \(s \in [-1,1]\) to \(s \in [0,1]\) using \(s_{ij}=|bw(i,j)|\)

S = abs(cor_mat)

Clustering

Identifying gene modules

Using 1-S sa distance matrix

diss_S = 1-S
dend = diss_S %>% as.dist %>% hclust(method='average')
plot(dend, hang=0, labels=FALSE)

Using Dynamic Tree Cut, a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications

modules = cutreeDynamicTree(dend, deepSplit=F, minModuleSize=20)
names(modules) = labels(dend)

clusterings[['bicor']] = modules

Merging modules with similar expression profiles

Calculate the “eigengenes” (1st principal component) of each module and merging similar modules

module_colors = c('gray', viridis(max(modules)))

MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])+1])
MEs = MEs_output$eigengenes
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )

# Merge similar modules
bicor_dist = 1-bicor(MEs)
dend_MEs = bicor_dist %>% as.dist %>% hclust(method='average')
dend_MEs %>% as.dendrogram %>% set('labels', rep('', nrow(bicor_dist))) %>% plot(ylim=c(0, 0.6))
abline(h=0.5, col='#0099cc')
abline(h=0.2, col='#009999')

# Two main modules
tree_cut = cutree(dend_MEs, h=0.5)
top_modules = modules %>% replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==1])) %>% as.numeric), 1) %>%
                          replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==2])) %>% as.numeric), 2)
clusterings[['bicor_top_clusters']] = top_modules

# Closest modules
tree_cut = cutree(dend_MEs, h=0.2)
merged_modules = modules
n=0
for(i in sort(unique(tree_cut))){
  n=n+1
  merged_modules = merged_modules %>% 
                   replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==i])) %>% as.numeric), n)
}

clusterings[['bicor_merged']] = merged_modules
top_module_colors = c('gray', viridis(max(top_modules)))
merged_module_colors = c('gray', viridis(max(merged_modules)))

dend_colors = data.frame('ID'=names(modules[labels(dend)]),
                         'OriginalModules' = module_colors[modules[dend$order]+1],
                         'MergedModules' = merged_module_colors[merged_modules[dend$order]+1],
                         'TopModules' = top_module_colors[top_modules[dend$order]+1])

dend %>% as.dendrogram(hang=0) %>% set('labels', rep('', nrow(diss_S))) %>% plot(ylim=c(min(dend$height),1))
colored_bars(colors=dend_colors[,-1])

rm(MEs, modules, module_colors, MEs_output, top_modules, merged_modules, tree_cut, top_module_colors, dend_colors, 
   i, diss_S)


Using Pearson correlation

cor_mat = datExpr %>% t %>% cor

Correcting the correlation matrix from \(s \in [-1,1]\) to \(s \in [0,1]\) using \(s_{ij}=|bw(i,j)|\)

S = abs(cor_mat)

Clustering

diss_S = 1-S
dend = diss_S %>% as.dist %>% hclust(method='average')
plot(dend, hang=0, labels=FALSE)

Using Dynamic Tree Cut, a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications

modules = cutreeDynamicTree(dend, deepSplit=F, minModuleSize=20)
names(modules) = labels(dend)

clusterings[['cor']] = modules

Merging modules with similar expression profiles using the “eigengenes” (1st principal component) of each module and merging similar modules

module_colors = c('gray', viridis(max(modules)))

MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])+1])
MEs = MEs_output$eigengenes
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )

# Merge similar modules
bicor_dist = 1-bicor(MEs)
dend_MEs = bicor_dist %>% as.dist %>% hclust(method='average')
dend_MEs %>% as.dendrogram %>% set('labels', rep('', nrow(bicor_dist))) %>% plot(ylim=c(0, 0.55))
abline(h=0.5, col='#0099cc')
abline(h=0.2, col='#009999')

# Two main modules
tree_cut = cutree(dend_MEs, h=0.5)
top_modules = modules %>% replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==1])) %>% as.numeric), 1) %>%
                          replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==2])) %>% as.numeric), 2)
clusterings[['cor_top_clusters']] = top_modules

# Closest modules
tree_cut = cutree(dend_MEs, h=0.2)
merged_modules = modules
n=0
for(i in sort(unique(tree_cut))){
  n=n+1
  merged_modules = merged_modules %>% 
                   replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==i])) %>% as.numeric), n)
}

clusterings[['cor_merged']] = merged_modules
top_module_colors = c('gray', viridis(max(top_modules)))
merged_module_colors = c('gray', viridis(length(unique(merged_modules))))

dend_colors = data.frame('ID'=names(modules[labels(dend)]),
                         'OriginalModules' = module_colors[modules[dend$order]+1],
                         'MergedModules' = merged_module_colors[merged_modules[dend$order]+1],
                         'TopModules' = top_module_colors[top_modules[dend$order]+1])

dend %>% as.dendrogram(hang=0) %>% set('labels', rep('', nrow(diss_S))) %>% plot(ylim=c(min(dend$height),1))
colored_bars(colors=dend_colors[,-1])

rm(MEs, modules, module_colors, MEs_output, top_modules, merged_modules, tree_cut, top_module_colors, dend_colors, 
   i, diss_S)


Using euclidean distance

cor_mat = datExpr %>% dist

Correcting the correlation matrix to \(s \in [0,1]\) using \(s_{ij}=\frac{dist(i,j)-min(dist)}{max(dist)-min(dist)}\)

S = (cor_mat-min(cor_mat))/(max(cor_mat)-min(cor_mat))

Clustering

dend = S %>% hclust(method='average')
plot(dend, hang=-1, labels=FALSE)

Using Dynamic Tree Cut, a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications

modules = cutreeDynamicTree(dend, deepSplit=F, minModuleSize=20)
names(modules) = labels(dend)
clusterings[['euc']] = modules

Merging modules with similar expression profiles using the “eigengenes” (1st principal component) of each module and merging similar modules

module_colors = c('gray', viridis(max(modules)))

MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])+1])
MEs = MEs_output$eigengenes
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )

# Merge similar modules
bicor_dist = 1-bicor(MEs)
dend_MEs = bicor_dist %>% as.dist %>% hclust(method='average')
dend_MEs %>% as.dendrogram %>% set('labels', rep('', nrow(bicor_dist))) %>% plot(ylim=c(0, 0.18))
abline(h=0.15, col='#0099cc')
abline(h=0.04, col='#009999')

# Two main modules
tree_cut = cutree(dend_MEs, h=0.15)
top_modules = modules %>% replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==1])) %>% as.numeric), 1) %>%
                          replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==2])) %>% as.numeric), 2)
clusterings[['euc_top_clusters']] = top_modules

# Closest modules
tree_cut = cutree(dend_MEs, h=0.04)
merged_modules = modules
n=0
for(i in sort(unique(tree_cut))){
  n=n+1
  merged_modules = merged_modules %>% 
                   replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==i])) %>% as.numeric), n)
}

clusterings[['euc_merged']] = merged_modules
top_module_colors = c('gray', viridis(max(top_modules)))
merged_module_colors = c('gray', viridis(length(unique(merged_modules))))

dend_colors = data.frame('ID'=names(modules[labels(dend)]),
                         'OriginalModules' = module_colors[modules[dend$order]+1],
                         'MergedModules' = merged_module_colors[merged_modules[dend$order]+1],
                         'TopModules' = top_module_colors[top_modules[dend$order]+1])

dend %>% as.dendrogram(hang=-1) %>% set('labels', rep('', nrow(datExpr))) %>% plot(ylim=c(min(dend$height),0.6))
colored_bars(colors=dend_colors[,-1])

rm(MEs, modules, module_colors, MEs_output, top_modules, merged_modules, tree_cut, top_module_colors, dend_colors, i)





Exploratory analysis of clustering

Adjusted Rand Index comparison

cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
  cluster1 = sub(0, NA, clusterings[[i]]) %>% as.factor
  for(j in (i):length(clusterings)){
    cluster2 = sub(0, NA, clusterings[[j]]) %>% as.factor
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = F, Colv = F, dendrogram = 'none', col=rev(brewer.pal(9,'YlOrRd'))[4:9],
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(12,12))

rm(i, j, cluster1, cluster2, cluster_sim)

PCA

Cluster don’t follow any strong patterns, at least in the first principal components

pca = datExpr %>% prcomp

plot_data = data.frame('ID' = rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 
                       'PC3' = pca$x[,3], 'PC4' = pca$x[,4], 'PC5' = pca$x[,5],
                       'bicor' = sub(0,NA,clusterings[['bicor']]) %>% as.factor, 
                       'bicor_top_clusters' = sub(0,NA,clusterings[['bicor_top_clusters']]) %>% as.factor, 
                       'bicor_merged' = sub(0,NA,clusterings[['bicor_merged']]) %>% as.factor, 
                       'cor' = sub(0,NA,clusterings[['cor']]) %>% as.factor, 
                       'cor_top_clusters' = sub(0,NA,clusterings[['cor_top_clusters']]) %>% as.factor, 
                       'cor_merged' = sub(0,NA,clusterings[['cor_merged']]) %>% as.factor,
                       'euc' = sub(0,NA,clusterings[['euc']]) %>% as.factor, 
                       'euc_top_clusters' = sub(0,NA,clusterings[['euc_top_clusters']]) %>% as.factor, 
                       'euc_merged' = sub(0,NA,clusterings[['euc_merged']]) %>% as.factor)

selectable_scatter_plot(plot_data[,-1,], plot_data[,-1])
rm(pca, plot_data)



Conclusions

  • Using Euclidean distance creates clusters defined by their mean expression, perhaps it’s not a good metric because it doesn’t seem to capture any more complex behaviours in the data

  • It is interesting that the Euclidean distance top clusters don’t group together on the 1st PC and instead form stripes along it

  • Correlation based distance metrics seem to capture a type of structure in the data not related to the first principal components

  • I think biweight midcorrelation may be the best choice because it’s more robust than correlation




Save clusterings

clusterings_file = './../../FirstYearReview/Data/Gandal/clusterings.csv'

if(file.exists(clusterings_file)){

  df = read.csv(clusterings_file, row.names=1)
  
  if(!all(rownames(df) == rownames(datExpr))) stop('Gene ordering does not match the one in clusterings.csv!')
  
  for(clustering in names(clusterings)){
    df = df %>% mutate(!!clustering := sub(0, NA, clusterings[[clustering]]))
    rownames(df) = rownames(datExpr)
  }
  
} else {
  
  df = clusterings %>% unlist %>% matrix(nrow=length(clusterings), byrow=T) %>% t %>% data.frame %>% na_if(0)
  colnames(df) = names(clusterings)
  rownames(df) = rownames(datExpr)

}

write.csv(df, file=clusterings_file)

rm(clusterings_file, df, clustering)

Session info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pdfCluster_1.0-3            doParallel_1.0.14          
##  [3] iterators_1.0.9             foreach_1.4.4              
##  [5] WGCNA_1.66                  fastcluster_1.1.25         
##  [7] dynamicTreeCut_1.63-1       sva_3.30.1                 
##  [9] genefilter_1.64.0           mgcv_1.8-26                
## [11] nlme_3.1-137                DESeq2_1.22.2              
## [13] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
## [15] BiocParallel_1.16.6         matrixStats_0.54.0         
## [17] Biobase_2.42.0              GenomicRanges_1.34.0       
## [19] GenomeInfoDb_1.18.2         IRanges_2.16.0             
## [21] S4Vectors_0.20.1            BiocGenerics_0.28.0        
## [23] biomaRt_2.38.0              gplots_3.0.1               
## [25] dendextend_1.10.0           gridExtra_2.3              
## [27] viridis_0.5.1               viridisLite_0.3.0          
## [29] RColorBrewer_1.1-2          plotlyutils_0.0.0.9000     
## [31] plotly_4.8.0                glue_1.3.1                 
## [33] reshape2_1.4.3              forcats_0.3.0              
## [35] stringr_1.4.0               dplyr_0.8.0.1              
## [37] purrr_0.3.1                 readr_1.3.1                
## [39] tidyr_0.8.3                 tibble_2.1.1               
## [41] ggplot2_3.1.0               tidyverse_1.2.1            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.1.0           backports_1.1.2        Hmisc_4.1-1           
##   [4] plyr_1.8.4             lazyeval_0.2.2         splines_3.5.2         
##   [7] robust_0.4-18          digest_0.6.18          htmltools_0.3.6       
##  [10] GO.db_3.7.0            gdata_2.18.0           magrittr_1.5          
##  [13] checkmate_1.8.5        memoise_1.1.0          fit.models_0.5-14     
##  [16] cluster_2.0.7-1        limma_3.38.3           annotate_1.60.1       
##  [19] modelr_0.1.4           prettyunits_1.0.2      colorspace_1.4-1      
##  [22] rrcov_1.4-3            blob_1.1.1             rvest_0.3.2           
##  [25] haven_1.1.1            xfun_0.5               crayon_1.3.4          
##  [28] RCurl_1.95-4.10        jsonlite_1.6           impute_1.56.0         
##  [31] survival_2.43-3        gtable_0.2.0           zlibbioc_1.28.0       
##  [34] XVector_0.22.0         kernlab_0.9-27         prabclus_2.2-7        
##  [37] DEoptimR_1.0-8         abind_1.4-5            scales_1.0.0          
##  [40] mvtnorm_1.0-7          DBI_1.0.0              Rcpp_1.0.1            
##  [43] xtable_1.8-3           progress_1.2.0         htmlTable_1.11.2      
##  [46] magic_1.5-9            foreign_0.8-71         bit_1.1-14            
##  [49] mclust_5.4             preprocessCore_1.44.0  Formula_1.2-3         
##  [52] htmlwidgets_1.3        httr_1.4.0             fpc_2.1-11.1          
##  [55] acepack_1.4.1          modeltools_0.2-22      pkgconfig_2.0.2       
##  [58] XML_3.98-1.11          flexmix_2.3-15         nnet_7.3-12           
##  [61] locfit_1.5-9.1         tidyselect_0.2.5       rlang_0.3.2           
##  [64] AnnotationDbi_1.44.0   munsell_0.5.0          cellranger_1.1.0      
##  [67] tools_3.5.2            cli_1.1.0              generics_0.0.2        
##  [70] RSQLite_2.1.1          broom_0.5.1            geometry_0.4.0        
##  [73] evaluate_0.13          yaml_2.2.0             knitr_1.22            
##  [76] bit64_0.9-7            robustbase_0.93-0      caTools_1.17.1        
##  [79] whisker_0.3-2          xml2_1.2.0             compiler_3.5.2        
##  [82] rstudioapi_0.7         geneplotter_1.60.0     pcaPP_1.9-73          
##  [85] stringi_1.4.3          lattice_0.20-38        trimcluster_0.1-2.1   
##  [88] Matrix_1.2-15          pillar_1.3.1           data.table_1.12.0     
##  [91] bitops_1.0-6           R6_2.4.0               latticeExtra_0.6-28   
##  [94] KernSmooth_2.23-15     codetools_0.2-15       MASS_7.3-51.1         
##  [97] gtools_3.5.0           assertthat_0.2.1       withr_2.1.2           
## [100] GenomeInfoDbData_1.2.0 diptest_0.75-7         hms_0.4.2             
## [103] grid_3.5.2             rpart_4.1-13           class_7.3-14          
## [106] rmarkdown_1.12         lubridate_1.7.4        base64enc_0.1-3